sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.2  magrittr_1.5    tools_3.5.2     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11 
##  [9] knitr_1.21      stringr_1.3.1   xfun_0.4        digest_0.6.18  
## [13] evaluate_0.12

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## input attrition file
setwd("~/Documents/SMU/Doing DS/Unit 14")

emp <- read.xlsx("CaseStudy2-data.xlsx", sheet = 1, startRow = 1, colNames = TRUE)
#emp <- read.xlsx("CaseStudy2-data.xlsx", sheet = 1, startRow = 1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE, namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)

head(emp)
##   Age Attrition Attr_Ind    BusinessTravel BT1 BT2 DailyRate
## 1  41       Yes        1     Travel_Rarely   1   0      1102
## 2  49        No        0 Travel_Frequently   0   1       279
## 3  37       Yes        1     Travel_Rarely   1   0      1373
## 4  33        No        0 Travel_Frequently   0   1      1392
## 5  27        No        0     Travel_Rarely   1   0       591
## 6  32        No        0 Travel_Frequently   0   1      1005
##               Department DistanceFromHome Education EducationField
## 1                  Sales                1         2  Life Sciences
## 2 Research & Development                8         1  Life Sciences
## 3 Research & Development                2         2          Other
## 4 Research & Development                3         4  Life Sciences
## 5 Research & Development                2         1        Medical
## 6 Research & Development                2         2  Life Sciences
##   EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate
## 1             1              1                       2 Female         94
## 2             1              2                       3   Male         61
## 3             1              4                       4   Male         92
## 4             1              5                       4 Female         56
## 5             1              7                       1   Male         40
## 6             1              8                       4   Male         79
##   JobInvolvement JobLevel               JobRole JobSatisfaction
## 1              3        2       Sales Executive               4
## 2              2        2    Research Scientist               2
## 3              2        1 Laboratory Technician               3
## 4              3        1    Research Scientist               3
## 5              3        1 Laboratory Technician               2
## 6              3        1 Laboratory Technician               4
##   MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18
## 1        Single          5993       19479                  8      Y
## 2       Married          5130       24907                  1      Y
## 3        Single          2090        2396                  6      Y
## 4       Married          2909       23159                  1      Y
## 5       Married          3468       16632                  9      Y
## 6        Single          3068       11864                  0      Y
##   OverTime OT_ind PercentSalaryHike PerformanceRating
## 1      Yes      1                11                 3
## 2       No      0                23                 4
## 3      Yes      1                15                 3
## 4      Yes      1                11                 3
## 5       No      0                12                 3
## 6       No      0                13                 3
##   RelationshipSatisfaction StandardHours StockOptionLevel
## 1                        1            80                0
## 2                        4            80                1
## 3                        2            80                0
## 4                        3            80                0
## 5                        4            80                1
## 6                        3            80                0
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                 8                     0               1              6
## 2                10                     3               3             10
## 3                 7                     3               3              0
## 4                 8                     3               3              8
## 5                 6                     3               3              2
## 6                 8                     2               2              7
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1                  4                       0                    5
## 2                  7                       1                    7
## 3                  0                       0                    0
## 4                  7                       3                    0
## 5                  2                       2                    2
## 6                  7                       3                    6
#Missing values check

#dimension
dim(emp)
## [1] 1470   39
#summary
summary(emp$MonthlyIncome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1009    2911    4919    6503    8379   19999
summary(emp$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   30.00   36.00   36.92   43.00   60.00
summary(as.numeric(emp$YearsAtCompany))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   5.000   7.008   9.000  40.000
summary(emp$NumCompaniesWorked)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.693   4.000   9.000
summary(emp$DistanceFromHome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   7.000   9.193  14.000  29.000
summary(emp$YearsInCurrentRole)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   4.229   7.000  18.000
summary(emp$JobSatisfaction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.729   4.000   4.000
hist(emp$MonthlyIncome)

hist(emp$Age)

hist(as.numeric(emp$YearsAtCompany))

hist(emp$NumCompaniesWorked)

hist(emp$DistanceFromHome)

hist(emp$YearsInCurrentRole)

freq(emp$Gender)
##          n  % val%
## Female 588 40   40
## Male   882 60   60
freq(emp$Education)
##     n    % val%
## 1 170 11.6 11.6
## 2 282 19.2 19.2
## 3 572 38.9 38.9
## 4 398 27.1 27.1
## 5  48  3.3  3.3
freq(emp$JobRole)
##                             n    % val%
## Healthcare Representative 131  8.9  8.9
## Human Resources            52  3.5  3.5
## Laboratory Technician     259 17.6 17.6
## Manager                   102  6.9  6.9
## Manufacturing Director    145  9.9  9.9
## Research Director          80  5.4  5.4
## Research Scientist        292 19.9 19.9
## Sales Executive           326 22.2 22.2
## Sales Representative       83  5.6  5.6
## input age file
fit <- lm(Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + OT_ind + BT1 + BT2 + Attr_Ind, data=emp)


summary(fit)
## 
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.3916  -5.4903  -0.9349   4.2522  26.7550 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.9418089  0.7173773  38.950  < 2e-16 ***
## MonthlyIncome       0.0007144  0.0000506  14.119  < 2e-16 ***
## NumCompaniesWorked  0.9776191  0.0818675  11.941  < 2e-16 ***
## YearsAtCompany      0.2082472  0.0384393   5.418 7.06e-08 ***
## OT_ind              1.2025377  0.4501058   2.672  0.00763 ** 
## BT1                 0.3568153  0.6581930   0.542  0.58782    
## BT2                 0.4648894  0.7681020   0.605  0.54511    
## Attr_Ind           -2.7124446  0.5644751  -4.805 1.70e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3227 
## F-statistic:   101 on 7 and 1462 DF,  p-value: < 2.2e-16
# Assessing Outliers
#outlierTest(fit) # Bonferonni p-value for most extreme obs
qqPlot(fit, main="QQ Plot")  #qq plot for studentized resid 

## [1]  939 1355
leveragePlots(fit) # leverage plots

# Normality of Residuals

# distribution of studentized residuals
library(MASS)
sresid <- studres(fit) 
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40) 
yfit<-dnorm(xfit) 
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fit)
## 
## Suggested power transformation:  0.7645897
# Evaluate Collinearity
vif(fit) # variance inflation factors 
##      MonthlyIncome NumCompaniesWorked     YearsAtCompany 
##           1.475043           1.086961           1.441393 
##             OT_ind                BT1                BT2 
##           1.069121           2.322108           2.346506 
##           Attr_Ind 
##           1.120636
sqrt(vif(fit)) > 2 # problem?
##      MonthlyIncome NumCompaniesWorked     YearsAtCompany 
##              FALSE              FALSE              FALSE 
##             OT_ind                BT1                BT2 
##              FALSE              FALSE              FALSE 
##           Attr_Ind 
##              FALSE
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fit)

## 
## Suggested power transformation:  0.7645897
# Evaluate Nonlinearity
# component + residual plot 
crPlots(fit)

# Ceres plots 
#ceresPlots(fit)

# Test for Autocorrelated Errors
durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01150988       2.02301   0.634
##  Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
gvmodel <- gvlma(fit) 
summary(gvmodel)
## 
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.3916  -5.4903  -0.9349   4.2522  26.7550 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.9418089  0.7173773  38.950  < 2e-16 ***
## MonthlyIncome       0.0007144  0.0000506  14.119  < 2e-16 ***
## NumCompaniesWorked  0.9776191  0.0818675  11.941  < 2e-16 ***
## YearsAtCompany      0.2082472  0.0384393   5.418 7.06e-08 ***
## OT_ind              1.2025377  0.4501058   2.672  0.00763 ** 
## BT1                 0.3568153  0.6581930   0.542  0.58782    
## BT2                 0.4648894  0.7681020   0.605  0.54511    
## Attr_Ind           -2.7124446  0.5644751  -4.805 1.70e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3227 
## F-statistic:   101 on 7 and 1462 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit) 
## 
##                      Value   p-value                   Decision
## Global Stat        147.935 0.0000000 Assumptions NOT satisfied!
## Skewness           120.341 0.0000000 Assumptions NOT satisfied!
## Kurtosis             8.124 0.0043692 Assumptions NOT satisfied!
## Link Function       17.050 0.0000364 Assumptions NOT satisfied!
## Heteroscedasticity   2.419 0.1198383    Assumptions acceptable.
## input Attrition 
fitAttr <- lm( MonthlyIncome ~ Age  + StockOptionLevel + NumCompaniesWorked + YearsAtCompany + OT_ind + YearsInCurrentRole + YearsWithCurrManager + Attr_Ind, data=emp)


summary(fitAttr)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + StockOptionLevel + NumCompaniesWorked + 
##     YearsAtCompany + OT_ind + YearsInCurrentRole + YearsWithCurrManager + 
##     Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10919.4  -2250.0   -495.8   1490.2  12852.3 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2049.28     427.68  -4.792 1.82e-06 ***
## Age                    165.68      11.89  13.935  < 2e-16 ***
## StockOptionLevel      -131.09     112.64  -1.164  0.24469    
## NumCompaniesWorked     197.98      41.15   4.811 1.66e-06 ***
## YearsAtCompany         388.35      28.50  13.628  < 2e-16 ***
## OT_ind                 189.46     218.27   0.868  0.38553    
## YearsInCurrentRole     -29.71      42.50  -0.699  0.48464    
## YearsWithCurrManager  -120.66      43.98  -2.743  0.00616 ** 
## Attr_Ind              -908.43     276.27  -3.288  0.00103 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3635 on 1461 degrees of freedom
## Multiple R-squared:  0.4072, Adjusted R-squared:  0.404 
## F-statistic: 125.4 on 8 and 1461 DF,  p-value: < 2.2e-16
# Assessing Outliers
#outlierTest(fit) # Bonferonni p-value for most extreme obs

qqPlot(fitAttr, main="QQ Plot")  #qq plot for studentized resid 

## [1]  498 1130
leveragePlots(fitAttr) # leverage plots

# Normality of Residuals

# distribution of studentized residuals
library(MASS)
sresid <- studres(fitAttr) 
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40) 
yfit<-dnorm(xfit) 
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fitAttr)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 244.717, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fitAttr)
## 
## Suggested power transformation:  0.03038701
# Evaluate Collinearity
vif(fitAttr) # variance inflation factors 
##                  Age     StockOptionLevel   NumCompaniesWorked 
##             1.311738             1.024351             1.175033 
##       YearsAtCompany               OT_ind   YearsInCurrentRole 
##             3.389324             1.075610             2.636556 
## YearsWithCurrManager             Attr_Ind 
##             2.738484             1.148485
sqrt(vif(fitAttr)) > 2 # problem?
##                  Age     StockOptionLevel   NumCompaniesWorked 
##                FALSE                FALSE                FALSE 
##       YearsAtCompany               OT_ind   YearsInCurrentRole 
##                FALSE                FALSE                FALSE 
## YearsWithCurrManager             Attr_Ind 
##                FALSE                FALSE
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fitAttr)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 244.717, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fitAttr)

## 
## Suggested power transformation:  0.03038701
# Evaluate Nonlinearity
# component + residual plot 
crPlots(fitAttr)

# Ceres plots 
#ceresPlots(fitAttr)

# Test for Autocorrelated Errors
durbinWatsonTest(fitAttr)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01969786      2.039285   0.458
##  Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
gvmodel <- gvlma(fitAttr) 
summary(gvmodel)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + StockOptionLevel + NumCompaniesWorked + 
##     YearsAtCompany + OT_ind + YearsInCurrentRole + YearsWithCurrManager + 
##     Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10919.4  -2250.0   -495.8   1490.2  12852.3 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2049.28     427.68  -4.792 1.82e-06 ***
## Age                    165.68      11.89  13.935  < 2e-16 ***
## StockOptionLevel      -131.09     112.64  -1.164  0.24469    
## NumCompaniesWorked     197.98      41.15   4.811 1.66e-06 ***
## YearsAtCompany         388.35      28.50  13.628  < 2e-16 ***
## OT_ind                 189.46     218.27   0.868  0.38553    
## YearsInCurrentRole     -29.71      42.50  -0.699  0.48464    
## YearsWithCurrManager  -120.66      43.98  -2.743  0.00616 ** 
## Attr_Ind              -908.43     276.27  -3.288  0.00103 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3635 on 1461 degrees of freedom
## Multiple R-squared:  0.4072, Adjusted R-squared:  0.404 
## F-statistic: 125.4 on 8 and 1461 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fitAttr) 
## 
##                        Value p-value                   Decision
## Global Stat        3.100e+02  0.0000 Assumptions NOT satisfied!
## Skewness           2.009e+02  0.0000 Assumptions NOT satisfied!
## Kurtosis           1.085e+02  0.0000 Assumptions NOT satisfied!
## Link Function      4.837e-03  0.9446    Assumptions acceptable.
## Heteroscedasticity 4.758e-01  0.4903    Assumptions acceptable.
library(tidyr)  # data tidying (e.g., spread)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
## The following object is masked from 'package:pastecs':
## 
##     extract
## The following object is masked from 'package:magrittr':
## 
##     extract
# explore status/terminations by Department
#status_count <- with(emp, table(Department, Attrition))
#status_count <- spread(data.frame(status_count), Attrition, Freq)
#status_count$previous_active <- shift(status_count$ACTIVE, 1L, type = "lag")
#status_count$percent_terminated <- 100*status_count$Yes / status_count$previous_active
#status_count



# chart data in questions
#ggplot() + geom_bar(aes(y = ..count..,x = STATUS_YEAR, fill = termreason_desc), data=terms, position = position_stack()) +
#  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# "Age","BusinessTravel","Department","DistanceFromHome","Education","EducationField", "Gender", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked",  "Overtime", "PercentSalaryHike","PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","YearsAtCompany","YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager","Attrition"

# Partition the data into training and test sets
term_vars <- c("Age","Education","MonthlyIncome","BusinessTravel","Department","DistanceFromHome","EducationField", "Gender","JobInvolvement","JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome","NumCompaniesWorked", "OverTime","PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager","Attrition")
emp_term_train <- subset(emp, Education < 4)
emp_term_test <- subset(emp, Education >= 4)
set.seed(99)  # set a pre-defined value for the random seed so that results are repeatable
# Decision tree model
rpart_model <- rpart(Attrition ~.,
                     data = emp_term_train[term_vars],
                     method = 'class',
                     parms = list(split='information'),
                     control = rpart.control(usesurrogate = 0,
                                             maxsurrogate = 0))
# Plot the decision tree
rpart.plot(rpart_model, roundint = FALSE, type = 3)

##

# plot terminated & active by age
emp$resigned <- as.factor(emp$Attrition)
summary(emp$resigned)
##   No  Yes 
## 1233  237
featurePlot(x=emp$Age, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Age (years)", ""))

##

# plot terminated & active by MonthlyIncome

featurePlot(x=emp$MonthlyIncome, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Monthly Income", ""))

##

featurePlot(x=emp$YearsAtCompany, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Monthly Income", ""))

Summary statistics for the ABV variable.

# Subset the data 
emp_cat <- data.frame(emp$Age,emp$Education,emp$MonthlyIncome,emp$BusinessTravel,emp$Department,emp$DistanceFromHome,emp$EducationField, emp$Gender,emp$JobInvolvement,emp$JobLevel, emp$JobRole, emp$JobSatisfaction, emp$MaritalStatus, emp$MonthlyIncome,emp$NumCompaniesWorked, emp$OverTime,emp$PercentSalaryHike,emp$PerformanceRating, emp$StockOptionLevel, emp$TotalWorkingYears,emp$TrainingTimesLastYear, emp$WorkLifeBalance,emp$RelationshipSatisfaction,emp$YearsAtCompany,emp$YearsInCurrentRole, emp$YearsSinceLastPromotion,emp$YearsWithCurrManager,emp$resigned,emp$EmployeeNumber)
#emp_cat <- data.frame(emp$Age,emp$Education,emp$MonthlyIncome,emp$DistanceFromHome,emp$JobInvolvement,emp$JobLevel, emp$JobRole, emp$JobSatisfaction, emp$MaritalStatus, emp$NumCompaniesWorked, emp$PercentSalaryHike,emp$PerformanceRating, emp$StockOptionLevel, emp$TotalWorkingYears,emp$TrainingTimesLastYear, emp$WorkLifeBalance,emp$RelationshipSatisfaction,emp$YearsAtCompany,emp$YearsInCurrentRole,emp$resigned,emp$EmployeeNumber)

col_headings <- c("Age","Education","MonthlyIncome","BusinessTravel","Department","DistanceFromHome","EducationField", "Gender","JobInvolvement","JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome","NumCompaniesWorked", "OverTime","PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager","resigned","EmployeeNumber")
names(emp_cat) <- col_headings
emp_train <- subset(emp_cat, Education < 4)
emp_test <- subset(emp_cat, Education >= 4)

emp_train_rose <- ROSE(resigned ~ ., data = emp_train, seed=125)$data
# Tables to show balanced dataset sample sizes
table(emp_train_rose$resigned)
## 
##  No Yes 
## 521 503
# Select variables (res_vars) for the model to predict 'resigned'
#res_vars <- c("Age","Education","MonthlyIncome","DistanceFromHome","JobInvolvement","JobLevel", "JobRole", "JobSatisfaction", "NumCompaniesWorked", "PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear",  "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole","resigned")
res_vars <- c("Age","Education","MonthlyIncome","BusinessTravel","Department","DistanceFromHome","EducationField", "Gender","JobInvolvement","JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome","NumCompaniesWorked", "OverTime","PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager","resigned")
set.seed(222)
emp_res_rose_RF <- randomForest(resigned ~ .,
                           data = emp_train_rose[res_vars],
                           ntree=500, importance = TRUE,
                           na.action = na.omit)
varImpPlot(emp_res_rose_RF,type=1,
           main="Variable Importance (Accuracy)",
           sub = "Random Forest Model", class = NULL, scale=TRUE)

pairs(~Age+MonthlyIncome+YearsAtCompany+YearsInCurrentRole,data=emp, 
   main="Simple Scatterplot Matrix")

#var_importance <- importance(emp_res_rose_RF)
#emp_res_rose_RF  # view results & Confusion matrix
# generate predictions based on test data ("emp_test")
emp_res_rose_RF_pred <- predict(emp_res_rose_RF, newdata = emp_test)
confusionMatrix(data = emp_res_rose_RF_pred,
                reference = emp_test$resigned,
                positive = "Yes", mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  321  25
##        Yes  62  38
##                                           
##                Accuracy : 0.8049          
##                  95% CI : (0.7651, 0.8407)
##     No Information Rate : 0.8587          
##     P-Value [Acc > NIR] : 0.9992922       
##                                           
##                   Kappa : 0.3544          
##  Mcnemar's Test P-Value : 0.0001136       
##                                           
##               Precision : 0.3800          
##                  Recall : 0.6032          
##                      F1 : 0.4663          
##              Prevalence : 0.1413          
##          Detection Rate : 0.0852          
##    Detection Prevalence : 0.2242          
##       Balanced Accuracy : 0.7206          
##                                           
##        'Positive' Class : Yes             
## 

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Calculate prediction probabilites of employees who will resign
emp_res_rose_RF_pred_probs <- predict(emp_res_rose_RF, emp_test, type="prob")
Employees_flight_risk <- as.data.frame(cbind(emp_test$EmployeeNumber,emp_res_rose_RF_pred_probs))
col_headings <- c("EmployeeNumber","No","Yes")
names(Employees_flight_risk) <- col_headings
Employees_flight_risk <- arrange(Employees_flight_risk, desc(Yes))
Employees_flight_risk[1:25, ]
##    EmployeeNumber    No   Yes
## 1             939 0.034 0.966
## 2              65 0.070 0.930
## 3             911 0.070 0.930
## 4            1286 0.076 0.924
## 5             493 0.078 0.922
## 6            1556 0.090 0.910
## 7            2003 0.100 0.900
## 8             394 0.120 0.880
## 9            1318 0.142 0.858
## 10            925 0.144 0.856
## 11           1200 0.148 0.852
## 12           1704 0.150 0.850
## 13           1157 0.154 0.846
## 14             51 0.156 0.844
## 15            964 0.158 0.842
## 16            773 0.160 0.840
## 17           1055 0.160 0.840
## 18           1158 0.168 0.832
## 19           1862 0.168 0.832
## 20           1203 0.178 0.822
## 21             75 0.182 0.818
## 22            426 0.182 0.818
## 23           1405 0.186 0.814
## 24           1427 0.192 0.808
## 25            440 0.204 0.796
write.csv(Employees_flight_risk[1:25, ], file = "FlightRiskData.csv")
tail(Employees_flight_risk)
##     EmployeeNumber    No   Yes
## 441           1630 0.886 0.114
## 442            724 0.892 0.108
## 443           1484 0.900 0.100
## 444             98 0.904 0.096
## 445            170 0.914 0.086
## 446           1728 0.914 0.086